{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Configuring pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import numpy and pandas\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# used for dates\n",
    "import datetime\n",
    "from datetime import datetime, date\n",
    "\n",
    "# Set some pandas options controlling output format\n",
    "pd.set_option('display.notebook_repr_html', False)\n",
    "pd.set_option('display.max_columns', 8)\n",
    "pd.set_option('display.max_rows', 10)\n",
    "pd.set_option('display.width', 60)\n",
    "\n",
    "# bring in matplotlib for graphics\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Concatenating data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0\n",
       "1    1\n",
       "2    2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# two Series objects to concatenate\n",
    "s1 = pd.Series(np.arange(0, 3))\n",
    "s2 = pd.Series(np.arange(5, 8))\n",
    "s1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    5\n",
       "1    6\n",
       "2    7\n",
       "dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0\n",
       "1    1\n",
       "2    2\n",
       "0    5\n",
       "1    6\n",
       "2    7\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# concatenate them\n",
    "pd.concat([s1, s2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b  c\n",
       "0  0  1  2\n",
       "1  3  4  5\n",
       "2  6  7  8"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create two DataFrame objects to concatenate\n",
    "# using the same index labels and column names, \n",
    "# but different values\n",
    "df1 = pd.DataFrame(np.arange(9).reshape(3, 3), \n",
    "                   columns=['a', 'b', 'c'])\n",
    "#df2 has 9 .. 18\n",
    "df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3), \n",
    "                   columns=['a', 'b', 'c'])\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a   b   c\n",
       "0   9  10  11\n",
       "1  12  13  14\n",
       "2  15  16  17"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a   b   c\n",
       "0   0   1   2\n",
       "1   3   4   5\n",
       "2   6   7   8\n",
       "0   9  10  11\n",
       "1  12  13  14\n",
       "2  15  16  17"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# do the concat\n",
    "pd.concat([df1, df2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b  c\n",
       "0  0  1  2\n",
       "1  3  4  5\n",
       "2  6  7  8"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# demonstrate concatenating two DataFrame objects with\n",
    "# different columns\n",
    "df1 = pd.DataFrame(np.arange(9).reshape(3, 3), \n",
    "                   columns=['a', 'b', 'c'])\n",
    "df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3), \n",
    "                   columns=['a', 'c', 'd'])\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a   c   d\n",
       "0   9  10  11\n",
       "1  12  13  14\n",
       "2  15  16  17"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a    b   c     d\n",
       "0   0  1.0   2   NaN\n",
       "1   3  4.0   5   NaN\n",
       "2   6  7.0   8   NaN\n",
       "0   9  NaN  10  11.0\n",
       "1  12  NaN  13  14.0\n",
       "2  15  NaN  16  17.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# do the concat, NaN's will be filled in for\n",
    "# the d column for df1 and b column for df2\n",
    "pd.concat([df1, df2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "        a    b   c     d\n",
       "df1 0   0  1.0   2   NaN\n",
       "    1   3  4.0   5   NaN\n",
       "    2   6  7.0   8   NaN\n",
       "df2 0   9  NaN  10  11.0\n",
       "    1  12  NaN  13  14.0\n",
       "    2  15  NaN  16  17.0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# concat the two objects, but create an index using the\n",
    "# given keys \n",
    "c = pd.concat([df1, df2], keys=['df1', 'df2'])\n",
    "# note in the labeling of the rows in the output\n",
    "c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a   b   c     d\n",
       "0   9 NaN  10  11.0\n",
       "1  12 NaN  13  14.0\n",
       "2  15 NaN  16  17.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can extract the data originating from\n",
    "# the first or second source DataFrame\n",
    "c.loc['df2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b  c   a   c   d\n",
       "0  0  1  2   9  10  11\n",
       "1  3  4  5  12  13  14\n",
       "2  6  7  8  15  16  17"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# concat df1 and df2 along columns\n",
    "# aligns on row labels, has duplicate columns\n",
    "pd.concat([df1, df2], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a   d\n",
       "2  20  21\n",
       "3  22  23\n",
       "4  24  25"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a new DataFrame to merge with df1\n",
    "# this has two common row labels (2, 3) \n",
    "# common columns (a) and one disjoint column\n",
    "# in each (b in df1 and d in df2)\n",
    "df3 = pd.DataFrame(np.arange(20, 26).reshape(3, 2), \n",
    "                   columns=['a', 'd'], \n",
    "                   index=[2, 3, 4])\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     a    b    c     a     d\n",
       "0  0.0  1.0  2.0   NaN   NaN\n",
       "1  3.0  4.0  5.0   NaN   NaN\n",
       "2  6.0  7.0  8.0  20.0  21.0\n",
       "3  NaN  NaN  NaN  22.0  23.0\n",
       "4  NaN  NaN  NaN  24.0  25.0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# concat them. Alignment is along row labels\n",
    "# columns first from df1 and then df3, with duplicates.\n",
    "# NaN filled in where those columns do not exist in the source\n",
    "pd.concat([df1, df3], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b  c   a   d\n",
       "2  6  7  8  20  21"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# do an inner join instead of outer\n",
    "# results in one row\n",
    "pd.concat([df1, df3], axis=1, join='inner')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  df1       df2        \n",
       "    a  b  c   a   c   d\n",
       "0   0  1  2   9  10  11\n",
       "1   3  4  5  12  13  14\n",
       "2   6  7  8  15  16  17"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add keys to the columns\n",
    "df = pd.concat([df1, df2], \n",
    "               axis=1,\n",
    "               keys=['df1', 'df2'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a   c   d\n",
       "0   9  10  11\n",
       "1  12  13  14\n",
       "2  15  16  17"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# retrieve the data that originated from the \n",
    "# DataFrame with key 'df2'\n",
    "df.loc[:, 'df2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a    b   c     d\n",
       "0   0  1.0   2   NaN\n",
       "1   3  4.0   5   NaN\n",
       "2   6  7.0   8   NaN\n",
       "0   9  NaN  10  11.0\n",
       "1  12  NaN  13  14.0\n",
       "2  15  NaN  16  17.0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# append does a concatenate along axis=0 \n",
    "# duplicate row index labels can result\n",
    "df1.append(df2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a    b   c     d\n",
       "0   0  1.0   2   NaN\n",
       "1   3  4.0   5   NaN\n",
       "2   6  7.0   8   NaN\n",
       "3   9  NaN  10  11.0\n",
       "4  12  NaN  13  14.0\n",
       "5  15  NaN  16  17.0"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove duplicates in the result index by ignoring the \n",
    "# index labels in the source DataFrame objects\n",
    "df1.append(df2, ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# An overview of merges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "              Address  CustomerID    Name\n",
       "0    Address for Mike          10    Mike\n",
       "1  Address for Marcia          11  Marcia"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# these are our customers\n",
    "customers = {'CustomerID': [10, 11],\n",
    "             'Name': ['Mike', 'Marcia'],\n",
    "             'Address': ['Address for Mike',\n",
    "                         'Address for Marcia']}\n",
    "customers = pd.DataFrame(customers)\n",
    "customers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   CustomerID   OrderDate\n",
       "0          10  2014-12-01\n",
       "1          11  2014-12-01\n",
       "2          10  2014-12-01"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# and these are the orders made by our customers\n",
    "# they are related to customers by CustomerID\n",
    "orders = {'CustomerID': [10, 11, 10],\n",
    "          'OrderDate': [date(2014, 12, 1),\n",
    "                        date(2014, 12, 1),\n",
    "                        date(2014, 12, 1)]}\n",
    "orders = pd.DataFrame(orders)\n",
    "orders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "              Address  CustomerID    Name   OrderDate\n",
       "0    Address for Mike          10    Mike  2014-12-01\n",
       "1    Address for Mike          10    Mike  2014-12-01\n",
       "2  Address for Marcia          11  Marcia  2014-12-01"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# merge customers and orders so we can ship the items\n",
    "customers.merge(orders)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1 key2  lval1\n",
       "0    a    x      0\n",
       "1    b    y      1\n",
       "2    c    z      2"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data to be used in the remainder of this section's examples\n",
    "left_data = {'key1': ['a', 'b', 'c'], \n",
    "            'key2': ['x', 'y', 'z'],\n",
    "            'lval1': [ 0, 1, 2]}\n",
    "right_data = {'key1': ['a', 'b', 'c'],\n",
    "              'key2': ['x', 'a', 'z'], \n",
    "              'rval1': [ 6, 7, 8 ]}\n",
    "left = pd.DataFrame(left_data, index=[0, 1, 2])\n",
    "right = pd.DataFrame(right_data, index=[1, 2, 3])\n",
    "left"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1 key2  rval1\n",
       "1    a    x      6\n",
       "2    b    a      7\n",
       "3    c    z      8"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "right"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1 key2  lval1  rval1\n",
       "0    a    x      0      6\n",
       "1    c    z      2      8"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# demonstrate merge without specifying columns to merge\n",
    "# this will implicitly merge on all common columns\n",
    "left.merge(right)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1 key2_x  lval1 key2_y  rval1\n",
       "0    a      x      0      x      6\n",
       "1    b      y      1      a      7\n",
       "2    c      z      2      z      8"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# demonstrate merge using an explicit column\n",
    "# on needs the value to be in both DataFrame objects\n",
    "left.merge(right, on='key1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1 key2  lval1  rval1\n",
       "0    a    x      0      6\n",
       "1    c    z      2      8"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# merge explicitly using two columns\n",
    "left.merge(right, on=['key1', 'key2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1_x key2_x  lval1 key1_y key2_y  rval1\n",
       "1      b      y      1      a      x      6\n",
       "2      c      z      2      b      a      7"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# join on the row indices of both matrices\n",
    "pd.merge(left, right, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Specifying the join semantics of a merge operation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1 key2  lval1  rval1\n",
       "0    a    x    0.0    6.0\n",
       "1    b    y    1.0    NaN\n",
       "2    c    z    2.0    8.0\n",
       "3    b    a    NaN    7.0"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# outer join, merges all matched data, \n",
    "# and fills unmatched items with NaN\n",
    "left.merge(right, how='outer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1 key2  lval1  rval1\n",
       "0    a    x      0    6.0\n",
       "1    b    y      1    NaN\n",
       "2    c    z      2    8.0"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# left join, merges all matched data, and only fills unmatched \n",
    "# items from the left dataframe with NaN filled for the \n",
    "# unmatched items in the result \n",
    "# rows with labels 0 and 2 \n",
    "# match on key1 and key2 the row with label 1 is from left\n",
    "\n",
    "left.merge(right, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1 key2  lval1  rval1\n",
       "0    a    x    0.0      6\n",
       "1    c    z    2.0      8\n",
       "2    b    a    NaN      7"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# right join, merges all matched data, and only fills unmatched\n",
    "# item from the right with NaN filled for the unmatched items\n",
    "# in the result \n",
    "# rows with labels 0 and 2 match on key1 and key2\n",
    "# the row with label 1 is from right\n",
    "left.merge(right, how='right')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1_left key2_left  lval1 key1_right key2_right  rval1\n",
       "0         a         x      0        NaN        NaN    NaN\n",
       "1         b         y      1          a          x    6.0\n",
       "2         c         z      2          b          a    7.0"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# join left with right (default method is outer)\n",
    "# and since these DataFrame objects have duplicate column names\n",
    "# we just specify lsuffix and rsuffix\n",
    "left.join(right, lsuffix='_left', rsuffix='_right')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1_left key2_left  lval1 key1_right key2_right  rval1\n",
       "1         b         y      1          a          x      6\n",
       "2         c         z      2          b          a      7"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# join left with right with an inner join\n",
    "left.join(right, lsuffix='_left', rsuffix='_right', how='inner')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pivoting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    interval axis  reading\n",
       "0          0    X      0.0\n",
       "1          0    Y      0.5\n",
       "2          0    Z      1.0\n",
       "3          1    X      0.1\n",
       "4          1    Y      0.4\n",
       "..       ...  ...      ...\n",
       "7          2    Y      0.3\n",
       "8          2    Z      0.8\n",
       "9          3    X      0.3\n",
       "10         3    Y      0.2\n",
       "11         3    Z      0.7\n",
       "\n",
       "[12 rows x 3 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read in accelerometer data\n",
    "sensor_readings = pd.read_csv(\"data/accel.csv\")\n",
    "sensor_readings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   interval axis  reading\n",
       "0         0    X      0.0\n",
       "3         1    X      0.1\n",
       "6         2    X      0.2\n",
       "9         3    X      0.3"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extract X-axis readings\n",
    "sensor_readings[sensor_readings['axis'] == 'X']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "axis        X    Y    Z\n",
       "interval               \n",
       "0         0.0  0.5  1.0\n",
       "1         0.1  0.4  0.9\n",
       "2         0.2  0.3  0.8\n",
       "3         0.3  0.2  0.7"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pivot the data.  Interval becomes the index, the columns are\n",
    "# the current axes values, and use the readings as values\n",
    "sensor_readings.pivot(index='interval', \n",
    "                     columns='axis', \n",
    "                     values='reading')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stacking using non-hierarchical indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     a\n",
       "one  1\n",
       "two  2"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# simple DataFrame with one column\n",
    "df = pd.DataFrame({'a': [1, 2]}, index={'one', 'two'})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "one  a    1\n",
       "two  a    2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# push the column to another level of the index\n",
    "# the result is a Series where values are looked up through\n",
    "# a multi-index\n",
    "stacked1 = df.stack()\n",
    "stacked1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lookup one / a using just the index via a tuple\n",
    "stacked1[('one', 'a')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     a  b\n",
       "one  1  3\n",
       "two  2  4"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DataFrame with two columns\n",
    "df = pd.DataFrame({'a': [1, 2],\n",
    "                   'b': [3, 4]}, \n",
    "                  index={'one', 'two'})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "one  a    1\n",
       "     b    3\n",
       "two  a    2\n",
       "     b    4\n",
       "dtype: int64"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# push the two columns into a single level of the index\n",
    "stacked2 = df.stack()\n",
    "stacked2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lookup value with index of one / b\n",
    "stacked2[('one', 'b')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Unstacking using hierarchical indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "                      reading\n",
       "who    interval axis         \n",
       "Mike   0        X         0.0\n",
       "                Y         0.5\n",
       "                Z         1.0\n",
       "       1        X         0.1\n",
       "                Y         0.4\n",
       "...                       ...\n",
       "Mikael 2        Y        30.0\n",
       "                Z        80.0\n",
       "       3        X        30.0\n",
       "                Y        20.0\n",
       "                Z        70.0\n",
       "\n",
       "[24 rows x 1 columns]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make two copies of the sensor data, one for each user\n",
    "user1 = sensor_readings.copy()\n",
    "user2 = sensor_readings.copy()\n",
    "# add names to the two copies\n",
    "user1['who'] = 'Mike'\n",
    "user2['who'] = 'Mikael'\n",
    "# for demonstration, lets scale user2's readings\n",
    "user2['reading'] *= 100\n",
    "# and reorganize this to have a hierarchical row index\n",
    "multi_user_sensor_data = pd.concat([user1, user2]) \\\n",
    "            .set_index(['who', 'interval', 'axis'])\n",
    "multi_user_sensor_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "               reading\n",
       "interval axis         \n",
       "0        X         0.0\n",
       "         Y         0.5\n",
       "         Z         1.0\n",
       "1        X         0.1\n",
       "         Y         0.4\n",
       "...                ...\n",
       "2        Y         0.3\n",
       "         Z         0.8\n",
       "3        X         0.3\n",
       "         Y         0.2\n",
       "         Z         0.7\n",
       "\n",
       "[12 rows x 1 columns]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lookup user data for Mike using just the index\n",
    "multi_user_sensor_data.loc['Mike']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "             reading\n",
       "who    axis         \n",
       "Mike   X         0.1\n",
       "       Y         0.4\n",
       "       Z         0.9\n",
       "Mikael X        10.0\n",
       "       Y        40.0\n",
       "       Z        90.0"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# readings for all users and axes at interval 1\n",
    "multi_user_sensor_data.xs(1, level='interval')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "                reading             \n",
       "axis                  X     Y      Z\n",
       "who    interval                     \n",
       "Mikael 0            0.0  50.0  100.0\n",
       "       1           10.0  40.0   90.0\n",
       "       2           20.0  30.0   80.0\n",
       "       3           30.0  20.0   70.0\n",
       "Mike   0            0.0   0.5    1.0\n",
       "       1            0.1   0.4    0.9\n",
       "       2            0.2   0.3    0.8\n",
       "       3            0.3   0.2    0.7"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# unstack the who level\n",
    "multi_user_sensor_data.unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "              reading     \n",
       "who            Mikael Mike\n",
       "interval axis             \n",
       "0        X        0.0  0.0\n",
       "         Y       50.0  0.5\n",
       "         Z      100.0  1.0\n",
       "1        X       10.0  0.1\n",
       "         Y       40.0  0.4\n",
       "...               ...  ...\n",
       "2        Y       30.0  0.3\n",
       "         Z       80.0  0.8\n",
       "3        X       30.0  0.3\n",
       "         Y       20.0  0.2\n",
       "         Z       70.0  0.7\n",
       "\n",
       "[12 rows x 2 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# unstack at level=0\n",
    "multi_user_sensor_data.unstack(level=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "         reading                              \n",
       "who         Mike           Mikael             \n",
       "axis           X    Y    Z      X     Y      Z\n",
       "interval                                      \n",
       "0            0.0  0.5  1.0    0.0  50.0  100.0\n",
       "1            0.1  0.4  0.9   10.0  40.0   90.0\n",
       "2            0.2  0.3  0.8   20.0  30.0   80.0\n",
       "3            0.3  0.2  0.7   30.0  20.0   70.0"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# unstack who and axis levels\n",
    "unstacked = multi_user_sensor_data.unstack(['who', 'axis'])\n",
    "unstacked"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "                reading             \n",
       "axis                  X     Y      Z\n",
       "interval who                        \n",
       "0        Mikael     0.0  50.0  100.0\n",
       "         Mike       0.0   0.5    1.0\n",
       "1        Mikael    10.0  40.0   90.0\n",
       "         Mike       0.1   0.4    0.9\n",
       "2        Mikael    20.0  30.0   80.0\n",
       "         Mike       0.2   0.3    0.8\n",
       "3        Mikael    30.0  20.0   70.0\n",
       "         Mike       0.3   0.2    0.7"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# and we can of course stack what we have unstacked\n",
    "# this re-stacks who\n",
    "unstacked.stack(level='who')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Melting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Height    Name  Weight\n",
       "0     6.1    Mike     220\n",
       "1     6.0  Mikael     185"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we will demonstrate melting with this DataFrame\n",
    "data = pd.DataFrame({'Name' : ['Mike', 'Mikael'],\n",
    "                     'Height' : [6.1, 6.0],\n",
    "                     'Weight' : [220, 185]})\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     Name variable  value\n",
       "0    Mike   Height    6.1\n",
       "1  Mikael   Height    6.0\n",
       "2    Mike   Weight  220.0\n",
       "3  Mikael   Weight  185.0"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# melt it, use Name as the id's, \n",
    "# Height and Weight columns as the variables\n",
    "pd.melt(data, \n",
    "        id_vars=['Name'],\n",
    "        value_vars=['Height', 'Weight'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Performance benefits of stacked data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.5042991369991796, 1.2535194699958083, 0.1478830999985803)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# stacked scalar access can be a lot faster than\n",
    "# column access\n",
    "\n",
    "# time the different methods\n",
    "import timeit\n",
    "t = timeit.Timer(\"stacked1[('one', 'a')]\", \n",
    "                 \"from __main__ import stacked1, df\")\n",
    "r1 = timeit.timeit(lambda: stacked1.loc[('one', 'a')], \n",
    "                   number=10000)\n",
    "r2 = timeit.timeit(lambda: df.loc['one']['a'], \n",
    "                   number=10000)\n",
    "r3 = timeit.timeit(lambda: df.iloc[1, 0], \n",
    "                   number=10000)\n",
    "\n",
    "# and the results are...  Yes, it's the fastest of the three\n",
    "r1, r2, r3"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
